| Program | Distribution | Continuous fixed eff. | Random intercepts | Random slopes | Interactive eff. |
|---|---|---|---|---|---|
| edgeR | Negative binomial generalized linear model | ✔ | ✖ | ✖ | ✔ |
| DESeq2 | Negative binomial generalized linear model | ✔ | ✖ | ✖ | ✔ |
| Voom | Mean-variance linear model | ✔ | ✔ | ✖ | ✔ |
## Uncomment and run the code below if you need to install the packages loaded below
#invisible(lapply(c("DESeq2", "edgeR", "tidyverse", "ape", "vegan", "GGally",
#"arrayQualityMetrics", "rgl", "adegenet", "MASS",
#"data.table", "plyr", "lmtest", "reshape2", "Rmisc", "lmerTest"),
#function(p){
#if(! p %in% rownames(installed.packages())) {
#install.packages(p)
# }
# library(p, character.only=TRUE)
#}))
# Load packages
library(DESeq2)
library(edgeR)
library(tidyverse)
library(ape)
library(vegan)
library(GGally)
library(arrayQualityMetrics)
library(rgl)
library(adegenet)
library(MASS)
library(data.table)
library(plyr)
library(lmtest)
library(reshape2)
library(Rmisc)
library(lmerTest)
# Read in matrix of RSEM expected read counts
# (genes in rows, samples in columns; tab-delimited, first column = gene IDs)
data <- read.delim("Input_data/GE2_expected_counts_gene.matrix", sep = "\t", header = T, row.names = 1)
# Peek at data to check that it looks okay
head(data)
## B7a.genes.results B7b.genes.results B7c.genes.results
## TR100110|c0_g1_i1 4.00 10.00 3.00
## TR101578|c0_g1_i1 0.00 0.00 0.00
## TR1047|c0_g1_i1 14.00 19.00 11.00
## TR105096|c0_g1_i1 14.00 12.00 9.00
## TR107626|c1_g1_i1 4530.93 28139.91 14141.83
## TR11301|c0_g1_i1 0.00 0.00 0.00
## B12a.genes.results B12b.genes.results B12c.genes.results
## TR100110|c0_g1_i1 6.00 9.0 8.00
## TR101578|c0_g1_i1 0.00 0.0 0.00
## TR1047|c0_g1_i1 15.00 14.0 9.00
## TR105096|c0_g1_i1 15.00 5.0 10.00
## TR107626|c1_g1_i1 80846.09 5649.4 30756.08
## TR11301|c0_g1_i1 0.00 0.0 0.00
## R7a.genes.results R7b.genes.results R7c.genes.results
## TR100110|c0_g1_i1 18.0 15.00 6.00
## TR101578|c0_g1_i1 0.0 0.00 0.00
## TR1047|c0_g1_i1 16.0 16.00 27.00
## TR105096|c0_g1_i1 25.0 15.00 6.00
## TR107626|c1_g1_i1 137592.5 45299.57 14843.47
## TR11301|c0_g1_i1 0.0 0.00 0.00
## R12a.genes.results R12b.genes.results R12c.genes.results
## TR100110|c0_g1_i1 11.00 13.0 9
## TR101578|c0_g1_i1 0.00 0.0 0
## TR1047|c0_g1_i1 23.00 14.0 11
## TR105096|c0_g1_i1 14.00 45.0 12
## TR107626|c1_g1_i1 21210.52 230436.5 0
## TR11301|c0_g1_i1 0.00 0.0 0
## Y7a.genes.results Y7b.genes.results Y7c.genes.results
## TR100110|c0_g1_i1 6 4 12.0
## TR101578|c0_g1_i1 0 0 0.0
## TR1047|c0_g1_i1 20 3 18.0
## TR105096|c0_g1_i1 9 1 13.0
## TR107626|c1_g1_i1 0 0 112214.6
## TR11301|c0_g1_i1 0 0 0.0
## Y12a.genes.results Y12b.genes.results Y12c.genes.results
## TR100110|c0_g1_i1 21.00 12 12.00
## TR101578|c0_g1_i1 0.00 0 0.00
## TR1047|c0_g1_i1 25.00 14 9.00
## TR105096|c0_g1_i1 14.00 19 24.00
## TR107626|c1_g1_i1 86243.14 148100 74153.54
## TR11301|c0_g1_i1 0.00 0 0.00
# Name experimental samples: names correspond to nominal pCO2 treatment (300, 600, 900) + exposure time (12 h vs 7 days) + replicate ID
# NOTE(review): the input columns are ordered B*, R*, Y*; the "900.*" names are
# applied to the R samples and the "600.*" names to the Y samples, yet `targets`
# below assigns pCO2 = 530 to R and 918 to Y — confirm the name-to-treatment
# mapping is intentional.
colnames(data) <- c( "300.7.a", "300.7.b", "300.7.c","300.12.a", "300.12.b", "300.12.c",
"900.7.a", "900.7.b", "900.7.c", "900.12.a", "900.12.b", "900.12.c",
"600.7.a", "600.7.b", "600.7.c", "600.12.a", "600.12.b", "600.12.c")
## Create 'targets' and 'Group' dataframes, expressing experimental variables for DEG analysis
# Measured pCO2 per sample, in column order (six replicates per level)
pCO2 <- as.numeric( c( 255, 255, 255, 255, 255, 255,
530, 530, 530, 530, 530, 530,
918, 918, 918, 918, 918, 918 ) )
# Treatment codes (B/R/Y) matching the original column prefixes of `data`
treatment <- c( "B", "B", "B", "B", "B", "B",
"R", "R", "R", "R", "R", "R",
"Y", "Y", "Y", "Y", "Y", "Y" )
# Exposure time in days (0.5 = 12 h)
day <- as.numeric( c( 7, 7, 7, .5, .5, .5,
7, 7, 7, .5, .5, .5,
7, 7, 7, .5, .5, .5 ) )
targets <- data.frame( pCO2, day, treatment )
# Combined pCO2 x day grouping label, e.g. "255.7"
targets$grouping <- paste( targets$pCO2, targets$day, sep = "." )
# The group factor represents the combined levels of an experimental replicate across all variables
Group <- factor( paste( targets$day, targets$pCO2, sep = "_" ) )
# Data must be rounded to nearest integer in order to be fit for negative binomial distribution
data_input <- round( data )
# Peak at rounded data
head( data_input )
## 300.7.a 300.7.b 300.7.c 300.12.a 300.12.b 300.12.c 900.7.a
## TR100110|c0_g1_i1 4 10 3 6 9 8 18
## TR101578|c0_g1_i1 0 0 0 0 0 0 0
## TR1047|c0_g1_i1 14 19 11 15 14 9 16
## TR105096|c0_g1_i1 14 12 9 15 5 10 25
## TR107626|c1_g1_i1 4531 28140 14142 80846 5649 30756 137592
## TR11301|c0_g1_i1 0 0 0 0 0 0 0
## 900.7.b 900.7.c 900.12.a 900.12.b 900.12.c 600.7.a 600.7.b
## TR100110|c0_g1_i1 15 6 11 13 9 6 4
## TR101578|c0_g1_i1 0 0 0 0 0 0 0
## TR1047|c0_g1_i1 16 27 23 14 11 20 3
## TR105096|c0_g1_i1 15 6 14 45 12 9 1
## TR107626|c1_g1_i1 45300 14843 21211 230436 0 0 0
## TR11301|c0_g1_i1 0 0 0 0 0 0 0
## 600.7.c 600.12.a 600.12.b 600.12.c
## TR100110|c0_g1_i1 12 21 12 12
## TR101578|c0_g1_i1 0 0 0 0
## TR1047|c0_g1_i1 18 25 14 9
## TR105096|c0_g1_i1 13 14 19 24
## TR107626|c1_g1_i1 112215 86243 148100 74154
## TR11301|c0_g1_i1 0 0 0 0
# Plot distribution of unfiltered read counts across all samples.
# Fix: the original chained two labs() calls both setting `title`; the second
# silently overrides the first, so the redundant first labs() is removed.
# Note: data.frame(rowMeans(data_input)) auto-names its single column
# "rowMeans.data_input.", which is what aes() references below.
ggplot( data = data.frame( rowMeans( data_input ) ),
        aes( x = rowMeans.data_input. ) ) +
  geom_histogram( fill = "grey" ) +
  xlim( 0, 500 ) +
  theme_classic() +
  labs( y = "Density", x = "Raw read counts",
        title = "Read count distribution: untransformed, unnormalized, unfiltered" )
# Make a DGEList object for edgeR (remove.zeros drops genes with all-zero counts)
y <- DGEList( counts = data_input, remove.zeros = TRUE )
# Keep genes with > 0.5 cpm (~10 counts in the count file) in at least 9 of the 18 samples
keep <- rowSums( cpm( y ) > .5 ) >= 9
table( keep )
## keep
## FALSE TRUE
## 18871 62579
# Apply the filter; keep.lib.sizes = FALSE recalculates library sizes after filtering
y <- y[ keep, keep.lib.sizes = FALSE ]
# Calculate normalization factors for library-size scaling
y <- calcNormFactors( y )
# Calculate logCPM (prior.count = 2 damps log of near-zero counts)
df_log <- cpm( y, log = TRUE, prior.count = 2 )
# Plot distribution of filtered logCPM values
# (auto-generated column name "rowMeans.df_log." is referenced in aes)
ggplot( data = data.frame( rowMeans( df_log ) ),
aes( x = rowMeans.df_log. ) ) +
geom_histogram( fill = "grey" ) +
theme_classic() +
labs( y = "Density", x = "Filtered read counts (logCPM)",
title = "Distribution of normalized, filtered read counts" )
# Export PCoA loadings.
# Fix: df_log was already computed above with identical arguments
# (cpm(y, log = TRUE, prior.count = 2)); the original re-computed it via an
# assignment nested inside the vegdist() call, which is redundant and obscures
# intent. Reuse the existing df_log — the result is unchanged.
dds.pcoa = pcoa( vegdist( t( df_log ),
method = "euclidean") / 1000 )
# Create df of MDS vector loadings (one row per sample)
scores <- dds.pcoa$vectors
## Plot pcoa loadings of each sample, grouped by time point and pCO2 treatment
# Calculate % variation explained by each eigenvector
percent <- dds.pcoa$values$Eigenvalues
cumulative_percent_variance <- ( percent / sum( percent ) ) * 100
# Prepare information for pcoa plot, then plot
# One color per pCO2 treatment level
color <- c( "steelblue1", "tomato1", "goldenrod1")
par( mfrow = c( 1, 1 ) )
# Scatter of the first two principal coordinates, axes labeled with % variance
plot( scores[ , 1 ], scores[ , 2 ],
cex=.5, cex.axis=1, cex.lab = 1.25,
xlab = paste( "PC1, ", round( cumulative_percent_variance[ 1 ], 2 ), "%" ),
ylab = paste( "PC2, ", round( cumulative_percent_variance[ 2 ], 2 ), "%" ) )
# Add visual groupings to pcoa plot: polygons wrap samples by treatment
ordihull( scores, as.factor(targets$treatment ),
border = NULL, lty = 2, lwd = .5, label = F,
col = color, draw = "polygon", alpha = 100, cex = .5 )
ordispider( scores,as.factor( targets$grouping ),label = F ) # Vectors connecting samples in same pCO2 x time group
ordilabel( scores, cex = 0.5) # Label sample IDs
# Standard PCA on the same logCPM matrix, for comparison with the PCoA above
logCPM.pca <- prcomp( t ( df_log ) )
logCPM.pca.proportionvariances <- ( ( logCPM.pca$sdev ^ 2 ) / ( sum( logCPM.pca$sdev ^ 2 ) ) ) * 100
## Do treatment groups fully segregate? Wrap samples by pCO2 x time, not just pCO2
# Replot using logCPM.pca
plot( logCPM.pca$x, type = "n", main = NA, xlab = paste( "PC1, ", round( logCPM.pca.proportionvariances[ 1 ], 2 ), "%" ), ylab = paste( "PC2, ", round( logCPM.pca.proportionvariances[ 2 ], 2 ), "%" ) )
points( logCPM.pca$x, col = "black", pch = 16, cex = 1 )
# Two shades per treatment (one per timepoint), wrapping by pCO2 x time group
colors2 <- c( "steelblue1", "dodgerblue2", "tomato1", "coral", "goldenrod1", "goldenrod3" )
ordihull( logCPM.pca$x, targets$grouping,
border = NULL, lty = 2, lwd = .5,
col = colors2, draw = "polygon",
alpha = 75,cex = .5, label = T )
From Rivera et al. 2021 - "Transcriptomic resilience and timing. (a) Gene expression reaction norms of four strategies during recovery after a stressor. We use triangles again for patterns that may confer tolerance and circles for patterns associated with stress sensitivity. While all triangle paths show a return to baseline (resilience) the pink (frontloading) and yellow (dampening) are also depicting differences in baseline and plasticity and are therefore labelled differently. (b) Adapted from the rolling ball analogy commonly used for ecological resilience and depicted in Hodgson et al. (2015). Each ball represents a gene showing a color-matched expression pattern in (a). Landscapes represent expression possibilities during a stress event. In the absence of stress, the ball will settle in a trough, representing baseline expression levels. Elasticity (rate of return to the baseline) is represented by the size of the arrow (i.e., larger arrows have faster rates of return). Pink dotted line is the expression landscape for the frontloaded ball. (c) Using Torres et al. (2016) loops through disease space as an alternative framework of an organism's path through stress response and recovery. The color gradient represents the resulting phenotype for a given path through stress and recovery space, though x-and y-axis can denote any two parameters that are correlated but with a time lag."
# Square the continuous pCO2 variable to model a quadratic (non-linear) response
pCO2_2 <- pCO2^2
# Estimate dispersion coefficients
y1 <- estimateDisp( y, robust = TRUE ) # Estimate dispersion (no design supplied — note the 'classic mode' message below)
## Design matrix not provided. Switch to the classic mode.
# Plot tagwise dispersion and impose w/ mean dispersion and trendline
plotBCV( y1 )
# Fit multifactorial design matrix
# Column order (per the summary headers later in this document):
# (Intercept), pCO2_2, pCO2, pCO2_2:day, pCO2:day
design_nl <- model.matrix( ~1 + pCO2_2 + pCO2 + pCO2_2:day + pCO2:day ) # Generate multivariate edgeR glm
# Fit quasi-likelihood, neg binom linear regression
nl_fit <- glmQLFit( y1, design_nl ) # Fit multivariate model to counts
## Test for effect of pCO2 and pCO2^2
nl_pCO2_2 <- glmQLFTest( nl_fit, coef = 2, contrast = NULL, poisson.bound = FALSE ) # coef 2 = pCO2_2
nl_pCO2 <- glmQLFTest( nl_fit, coef = 3, contrast = NULL, poisson.bound = FALSE ) # coef 3 = pCO2
# Make contrasts: classify each gene as Up/Down/NotSig at FDR < 0.05
is.de_nl_pCO2 <- decideTestsDGE( nl_pCO2, adjust.method = "fdr", p.value = 0.05 )
is.de_nl_pCO2_2 <- decideTestsDGE( nl_pCO2_2, adjust.method = "fdr", p.value = 0.05 )
# Summarize differential expression attributed to pCO2 and pCO2^2
summary( is.de_nl_pCO2 )
## pCO2
## Down 14166
## NotSig 35037
## Up 13376
summary( is.de_nl_pCO2_2 )
## pCO2_2
## Down 11091
## NotSig 39573
## Up 11915
# Plot differential expression (mean-difference plots) due to pCO2 and pCO2^2
plotMD( nl_pCO2 )
plotMD( nl_pCO2_2 )
## Test for interactions between time and pCO2 or pCO2^2
# NOTE(review): in the design (~1 + pCO2_2 + pCO2 + pCO2_2:day + pCO2:day),
# coef = 4 is the pCO2_2:day term and coef = 5 is pCO2:day (see the summary
# headers below). The object names appear transposed — nl_pCO2_int actually
# holds the pCO2_2:day test and nl_pCO2_2_int holds pCO2:day. Confirm this
# before interpreting downstream results that use these objects.
nl_pCO2_int <- glmQLFTest( nl_fit, coef = 4, contrast = NULL, poisson.bound = FALSE ) # Estimate significant DEGs
nl_pCO2_2_int <- glmQLFTest( nl_fit, coef = 5, contrast = NULL, poisson.bound = FALSE ) # Estimate significant DEGs
# Make contrasts (Up/Down/NotSig at FDR < 0.05)
is.de_nl_pCO2_int <- decideTestsDGE( nl_pCO2_int, adjust.method = "fdr", p.value = 0.05 ) # Make contrasts
is.de_nl_pCO2_2_int <- decideTestsDGE( nl_pCO2_2_int, adjust.method = "fdr", p.value = 0.05 )
# Summarize differential expression attributed to the two interaction terms
summary( is.de_nl_pCO2_int )
## pCO2_2:day
## Down 2
## NotSig 62573
## Up 4
summary( is.de_nl_pCO2_2_int )
## pCO2:day
## Down 390
## NotSig 61918
## Up 271
# Plot differential expression due to the two interaction terms
plotMD( nl_pCO2_int )
plotMD( nl_pCO2_2_int )
# Output observed counts from the fitted model object
y_nl <- nl_fit$counts
# Output fitted values
mu_nl <- nl_fit$fitted.values
# Output dispersion estimates
phi_nl <- nl_fit$dispersion
# Negative binomial variance function: V(mu) = mu * (1 + phi * mu)
v_nl <- mu_nl*(1+phi_nl*mu_nl)
# Pearson residual = (observed - fitted) / sqrt(variance)
resid.pearson <- (y_nl-mu_nl) / sqrt(v_nl)
# Plot distribution of Pearson residuals (melt stacks the sample columns into one 'value' column)
ggplot(data = melt(as.data.frame(resid.pearson)), aes(x = value)) +
geom_histogram( fill = "grey" ) +
xlim(-2.5,5.0) +
theme_classic() +
labs(title = "Distribution of negative binomial GLM residuals",
x = "Pearson residuals",
y = "Density")
## Bin transcripts based on (i) whether they have a significant positive or negative vertex and then (ii) whether they showed significant interactions between beta1 (vertex value) and time.
# Export diff expression data for transcripts with significant DE associated with PCO2^2 parameter
# n equals the total Down + Up counts reported by decideTestsDGE above
nl_pCO2_2_sig <- topTags(nl_pCO2_2, n = (11091 + 11915), adjust.method = "BH", p.value = 0.05)
# NOTE(review): topTags() returns a TopTags object; confirm row.names() on it
# returns gene IDs — otherwise use row.names(nl_pCO2_2_sig$table).
nl_pCO2_2_sig_geneids <- row.names(nl_pCO2_2_sig) #Output a list of geneids associated with sig PCO2^2 effect
nl_pCO2_sig <- topTags(nl_pCO2, n = (14166 + 13376), adjust.method = "BH", p.value = 0.05)
nl_pCO2_sig_geneids <- row.names(nl_pCO2_sig) #Output a list of geneids associated with sig PCO2 effect
# Create tabulated dataframe of mean expression across each pCO2 level with metadata for transcript ID and timepoint
logCPM_df <- as.data.frame(df_log)
# Create tabularized df containing all replicates using 'melt' function in reshape2
logCPM_df$geneid <- row.names(logCPM_df)
tab_exp_df <- melt(logCPM_df,
id = c("geneid"))
# Add covariate information for time and pCO2, parsed from sample names
# (e.g. "300.7.a": chars 1-3 = nominal pCO2; char 5 = "7", or "1" from "12")
tab_exp_df$pCO2 <- substr(tab_exp_df$variable, 1, 3)
tab_exp_df$time <- as.numeric(substr(tab_exp_df$variable, 5, 5))
# Correct pCO2s to exact values
# NOTE(review): the recoded values here (255 / 930 / 518) do not match the
# pCO2 values used in `targets` (255 / 530 / 918) — confirm which set is
# correct before comparing results across sections.
tab_exp_df$pCO2 <- as.numeric(
ifelse(tab_exp_df$pCO2 == "300", 255,
ifelse(tab_exp_df$pCO2 == "900", 930,
518)))
# Correct time to exact values ("1" — first char of "12" — means 12 h = 0.5 days)
tab_exp_df$time <- as.numeric(
ifelse(tab_exp_df$time == "1", 0.5, 7))
# Create binary variable in df_all_log for significant non-linear expression
tab_exp_df$pCO2_2_sig <- ifelse(tab_exp_df$geneid %in% nl_pCO2_2_sig_geneids, "Yes", "No")
tab_exp_df$pCO2_sig <- ifelse(tab_exp_df$geneid %in% nl_pCO2_sig_geneids, "Yes", "No")
# Create a binary variable related to up or down-regulation
# Note: genes absent from up_genes are labeled "Down", including genes that
# were not significant for pCO2 at all
up_genes <- filter(nl_pCO2_sig$table, logFC > 0)
tab_exp_df$logFC_dir <- ifelse(tab_exp_df$geneid %in% row.names(up_genes), "Up", "Down")
# Add geneid to nl_pCO2_int$coefficients
# NOTE(review): glmQLFTest coefficients are typically a matrix, and `$<-`
# assignment on a matrix fails — confirm this line runs; its result is not
# used elsewhere in this document.
nl_pCO2_int$coefficients$geneid <- row.names(nl_pCO2_int$coefficients)
# Estimate average logCPM per gene per timepoint (the `se` column is used for error bars later)
tab_exp_avg <- summarySE(measurevar = "value",
groupvars = c("pCO2", "time", "geneid", "pCO2_sig", "pCO2_2_sig", "logFC_dir"),
data = tab_exp_df)
# First exploratory plot of non-linear expression grouping by exposure time and direction of differential expression
# One faint path per gene (alpha = 0.01); facets: logFC direction x timepoint
ggplot(data = filter(tab_exp_avg, pCO2_2_sig == "Yes"),
aes(x = pCO2, y = value)) +
geom_path(alpha = 0.01, size = 0.25, stat = "identity", aes(group = as.factor(geneid))) +
facet_grid(logFC_dir~time) +
theme_classic() +
theme(strip.background = element_blank()) +
labs(y = "Avg logCPM", title = "Non-linear changes in GE output by edgeR")
## Using dlply, fit linear and non-linear models to each gene
# Create pCO2^2 variable in df_all_log
tab_exp_df$pCO2_2 <- tab_exp_df$pCO2^2
# Filter tabularized logCPM df for read depth of logCPM >= 0.5 in > 75% of samples
# NOTE(review): no filtering code follows the comment above — the models below
# are fit to the unfiltered tab_exp_df. Confirm whether a filter was intended.
# Fit linear models (one lm per gene) - should take about 4 minutes
lms <- dlply(tab_exp_df, c("geneid"), function(df)
lm(value ~ pCO2 + time + pCO2:time, data = df))
# Fit non-linear (quadratic-in-pCO2) models - should take about 2 minutes
nlms <- dlply(tab_exp_df, c("geneid"), function(df)
lm(value ~ pCO2 + pCO2_2 + time + pCO2:time + pCO2_2:time, data = df))
# Output nlm coefficients into dataframe (one row per gene)
nlms_coeff <- ldply(nlms, coef)
head(nlms_coeff)
## geneid (Intercept) pCO2 pCO2_2 time
## 1 TR107626|c1_g1_i1 1.9361224 3.995760e-02 -3.819357e-05 3.39371568
## 2 TR141909|c0_g1_i1 2.5171022 -2.413737e-03 1.468135e-06 -0.18785174
## 3 TR141946|c0_g1_i1 -1.0829071 2.491456e-03 -3.180644e-06 -0.27604730
## 4 TR141946|c0_g1_i2 0.6669886 5.415472e-03 -3.724832e-06 0.11726202
## 5 TR141951|c0_g1_i1 2.0588461 -7.240810e-04 1.457408e-06 0.01615581
## 6 TR141972|c0_g1_i1 0.3458156 -5.983124e-05 -5.072419e-06 0.16036294
## pCO2:time pCO2_2:time
## 1 -0.0180847725 1.627584e-05
## 2 0.0007717623 -5.931927e-07
## 3 0.0008763072 -5.730126e-07
## 4 -0.0006632481 5.727356e-07
## 5 0.0000993039 -1.247330e-07
## 6 -0.0001885367 3.957829e-07
## Apply LRTs to lm's and nlm's for each transcript - should take about 2 minutes
# Fix: preallocate the results list and iterate with seq_along() — the
# original used 1:length(), which misbehaves (yields c(1, 0)) on an empty
# list, and grew the list element-by-element.
lrts <- vector("list", length(lms))
for (i in seq_along(lms)) {
  lrts[[i]] <- lrtest(lms[[i]], nlms[[i]]) # LRT: linear vs non-linear fit per gene
}
## Filter lrt results for transcripts with significantly higher likelihoods of nl model
lrt_dfs <- vector("list", length(lrts))
# Turn list of LRT outputs into list of dataframes containing output info
# (lrts[i] — single bracket — is kept as in the original to preserve the
# exact data.frame construction)
for (i in seq_along(lrts)) {
  lrt_dfs[[i]] <- data.frame(lrts[i])
}
# Create singular dataframe with geneids and model outputs for chi-squared and LRT p-value
lrt_coeff_df <- na.omit(bind_rows(lrt_dfs, .id = "column_label")) # na.omit removes first row of each df, which lacks these data
# Add geneid based on element number from original list of LRT outputs
lrt_coeff_df <- merge(lrt_coeff_df,
data.frame(geneid = names(nlms),
column_label = as.character(seq(length(nlms)))),
by = "column_label")
# Apply FDR adjustment to LRT p-values before filtering for sig non-linear effects
lrt_coeff_df$FDR <- p.adjust(lrt_coeff_df$Pr..Chisq., method = "fdr")
# Filter LRT results for sig FDR coeff... produces 162 genes
lrt_filt <- filter(lrt_coeff_df, FDR < 0.05)
## Plot sig nl genes according to LRT, grouped by timepoint and direction of beta 1 coefficient
# Identify genes with positive linear (pCO2) and quadratic (pCO2_2) coefficients
pCO2_pos <- filter(nlms_coeff, pCO2 > 0)
pCO2_2_pos <- filter(nlms_coeff, pCO2_2 > 0)
# Bin genes based on positive or negative pCO2 and pCO2^2 betas
# NOTE(review): genes with positive quadratic coefficients are labeled
# "Concave" here — in standard usage a positive beta2 gives a convex
# (concave-up) curve; confirm the intended label orientation.
tab_exp_avg$pCO2_binom <- ifelse(tab_exp_avg$geneid %in% pCO2_pos$geneid, "Positive", "Negative")
tab_exp_avg$pCO2_2_binom <- ifelse(tab_exp_avg$geneid %in% pCO2_2_pos$geneid, "Concave", "Convex")
# Filter for gene id's with significant likelihood of nl effect in LRT
LRT_filt_df <- filter(tab_exp_avg, geneid %in% lrt_filt$geneid)
# Plot: per-gene paths with a loess smooth, faceted by curvature bin x time
ggplot(data = LRT_filt_df,
aes(x = pCO2, y = value)) +
geom_path(alpha = .25, size = 0.25, stat = "identity", aes(group = as.factor(geneid))) +
facet_grid(pCO2_2_binom~time) +
geom_smooth(method = "loess", se = TRUE, span = 1) +
theme_classic() +
theme(strip.background = element_blank()) +
labs(y = "Avg logCPM", title = "Non-linear changes in GE output by LRTs")
# Count how many gene id's with significant likelihood of nl effect in LRT... 162 genes
nrow(as.data.frame(unique(LRT_filt_df$geneid)))
## [1] 162
# Filter down df for gene id's exhibiting a significant pCO2 effect in edgeR and significant likelihood of nl effect in LRT
# (& binds tighter than |, so this keeps LRT-significant genes that have
# either a significant pCO2 or a significant pCO2^2 edgeR effect)
edgeR_LRT_df <- filter(tab_exp_avg, geneid %in% lrt_filt$geneid & pCO2_sig == "Yes" |
geneid %in% lrt_filt$geneid & pCO2_2_sig == "Yes")
# Plot: per-gene paths with a loess smooth, faceted by curvature bin x time
ggplot(data = edgeR_LRT_df,
aes(x = pCO2, y = value)) +
geom_path(alpha = .25, size = 0.25, stat = "identity", aes(group = as.factor(geneid))) +
facet_grid(pCO2_2_binom~time) +
geom_smooth(method = "loess", se = TRUE, span = 1) +
theme_classic() +
theme(strip.background = element_blank()) +
labs(y = "Avg logCPM", title = "Non-linear changes in GE output by edgeR & LRTs")
# Count how many gene id's exhibit pCO2 significant effect in edgeR and significant likelihood of nl effect in LRT... 89 genes
nrow(as.data.frame(unique(edgeR_LRT_df$geneid)))
## [1] 89
# Export diff expression data for transcripts with significant DE associated with interaction between PCO2^2 and time
# NOTE(review): nl_pCO2_2_int was fit with coef = 5, whose summary header
# earlier reads 'pCO2:day' (not pCO2_2:day); n = 390 + 271 matches that
# summary's Down + Up counts. Confirm the intended coefficient before
# interpreting this section as a pCO2^2 x time result.
nl_pCO2_2_int_sig <- topTags(nl_pCO2_2_int, n = (390 + 271), adjust.method = "BH",p.value = 0.05)
nl_pCO2_2_int_sig_geneids <- row.names(nl_pCO2_2_int_sig) #Output a list of geneids associated with sig PCO2^2 x time interaction
# Filter the averaged-expression df down to those interaction-significant genes
edgeR_interaction_df <- filter(tab_exp_avg, geneid %in% nl_pCO2_2_int_sig_geneids )
# Unique gene x timepoint label
edgeR_interaction_df$gene_id_time <- paste(edgeR_interaction_df$geneid,
edgeR_interaction_df$time,
sep = "_")
# Average logCPM across different groups according to pCO2^2 estimate and time
edgeR_interaction_avg <- summarySE(measurevar = "value",
groupvars = c("time", "pCO2", "pCO2_2_binom"),
data = edgeR_interaction_df)
# Plot mean +/- se per pCO2 level, colored by timepoint, faceted by curvature bin
ggplot(data = edgeR_interaction_avg,
aes(x = pCO2, y = value, color = as.factor(time), group = as.factor(time))) +
geom_path(stat = "identity") +
geom_errorbar(aes(ymin = value - se, ymax = value + se), width = 0) +
geom_point() +
facet_wrap(~pCO2_2_binom) +
theme_classic() +
theme(strip.background = element_blank()) +
labs(y = "logCPM", color = "Time (days)", title = "Interactions between pCO2^2 and exposure time")
# Fit multifactorial design matrix that includes an interaction term for pCO2 x day
design_multi <- model.matrix( ~1 + pCO2 + pCO2:day ) # Columns: (Intercept), pCO2, pCO2:day
# Fit quasi-likelihood, neg binom linear regression
multi_fit <- glmQLFit( y1, design_multi ) # Fit multivariate model to counts
# Test for effect of pCO2 (coef 2)
tr_pCO2 <- glmQLFTest( multi_fit, coef = 2, contrast = NULL, poisson.bound = FALSE ) # Estimate significant DEGs
is.de_tr_pCO2 <- decideTestsDGE( tr_pCO2, adjust.method = "fdr", p.value = 0.05 ) # Make contrasts
summary( is.de_tr_pCO2 )
## pCO2
## Down 0
## NotSig 62579
## Up 0
# Test for interaction between pCO2 and time (coef 3)
tr_int <- glmQLFTest( multi_fit, coef = 3, poisson.bound = FALSE ) # Estimate significant DEGs
is.de_int <- decideTestsDGE( tr_int, adjust.method = "fdr", p.value = 0.05 ) # Make contrasts
summary( is.de_int )
## pCO2:day
## Down 1021
## NotSig 60821
## Up 737
# Perform Voom transformation (precision weights for linear modeling of counts)
Voom <- voom( y, design_multi, plot = T )
# Fit using Voom
lm_Voom_fit <- lmFit( Voom, design_multi )
# Create a contrast across continuous pCO2 variable
cont_pCO2 <- contrasts.fit( lm_Voom_fit, coef = "pCO2" )
# Create a contrast across interaction between continuous pCO2 and time variables
cont_pCO2_day <- contrasts.fit( lm_Voom_fit, coef = "pCO2:day" )
# Perform empirical Bayes smoothing of standard errors
cont_pCO2 <- eBayes( cont_pCO2 )
cont_pCO2_day <- eBayes( cont_pCO2_day )
# Output test statistics for all genes (n = Inf)
pCO2_results <- topTable( cont_pCO2, coef = "pCO2", adjust.method="fdr", n = Inf )
pCO2_day_results <- topTable( cont_pCO2_day, coef = "pCO2:day", adjust.method="fdr", n = Inf )
# How many DEGs are associated with pCO2?
length( which( pCO2_results$adj.P.Val < 0.05 ) ) # number of DE genes =
## [1] 0
# How many DEGs are associated with pCO2:day?
length( which( pCO2_day_results$adj.P.Val < 0.05 ) )
## [1] 1628
# Keep the raw (rounded) counts as a plain data frame for DESeq2
gcounts <- as.data.frame(data_input)
# Per-sample library sizes
totalCounts <- colSums(gcounts)
### REMOVE GENES WITH LOW MEAN COUNTS ###
# Make a DGEList object for edgeR
# NOTE(review): this reassignment replaces the previously filtered and
# normalized `y` with a fresh, unfiltered DGEList; later uses of `y`
# (voom, cpm) therefore operate on unfiltered counts — confirm intended.
y <- DGEList(counts = data_input, remove.zeros = TRUE)
# Keep genes with > 0.5 cpm (~10 counts in the count file) in at least 9 of the 18 samples
keep_g <- rowSums(cpm( gcounts ) > .5) >= 9
table(keep_g)
## keep_g
## FALSE TRUE
## 20632 62579
# Set keep.lib.sizes = F and recalculate library sizes after filtering
# (left commented out: gcounts is passed to DESeq2 unfiltered below)
#gcounts <- gcounts[ keep_g, keep.lib.sizes = FALSE ]
### BUILD A DATAFRAME ASSOCIATING SAMPLE NAMES WITH TREATMENT CONDITIONS ###
targets
## pCO2 day treatment grouping
## 1 255 7.0 B 255.7
## 2 255 7.0 B 255.7
## 3 255 7.0 B 255.7
## 4 255 0.5 B 255.0.5
## 5 255 0.5 B 255.0.5
## 6 255 0.5 B 255.0.5
## 7 530 7.0 R 530.7
## 8 530 7.0 R 530.7
## 9 530 7.0 R 530.7
## 10 530 0.5 R 530.0.5
## 11 530 0.5 R 530.0.5
## 12 530 0.5 R 530.0.5
## 13 918 7.0 Y 918.7
## 14 918 7.0 Y 918.7
## 15 918 7.0 Y 918.7
## 16 918 0.5 Y 918.0.5
## 17 918 0.5 Y 918.0.5
## 18 918 0.5 Y 918.0.5
### WALD TEST - FULL MODEL ###
# Model: intercept + continuous pCO2 + pCO2:day interaction
dds <- DESeqDataSetFromMatrix(gcounts,
colData = targets,
design = formula( ~ 1 + pCO2 + day : pCO2))
# Regularized-log transform of the counts
rld <- rlog(dds)
rld.df <- assay(rld)
# Wald test for pCO2:day
# minReplicatesForReplace = Inf disables DESeq2's outlier-replacement step
dds_int <- DESeq(dds, minReplicatesForReplace = Inf)
design <- design( dds_int )
DESeq2_int_result_names <- resultsNames(dds_int)
# Count DEGs due to interaction (result name "pCO2.day" = the pCO2:day term)
DESeq2_int_results <- results(dds_int, name = "pCO2.day", lfcThreshold = 0, alpha = 0.05)
summary(DESeq2_int_results)
##
## out of 81450 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 2491, 3.1%
## LFC < 0 (down) : 2494, 3.1%
## outliers [1] : 1373, 1.7%
## low counts [2] : 2724, 3.3%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Merge logFC and pval data from each program into one gene-level dataframe
# Resulting columns: geneid, Voom_logFC, Voom_pval, edgeR_logFC, edgeR_pval,
# DESeq2_logFC, DESeq2_pval (then the neg-log p columns appended below)
Voom_edgeR_deseq_int_comp <- merge(
merge(
data.frame(geneid = row.names(pCO2_day_results),
Voom_logFC = pCO2_day_results$logFC,
Voom_pval = pCO2_day_results$P.Value),
data.frame(geneid = row.names(tr_int$table),
edgeR_logFC = tr_int$table$logFC, # NOTE(review): original comment said logFC is negated here, but no negation is applied — confirm sign conventions agree across programs
edgeR_pval = tr_int$table$PValue),
by = "geneid" ),
data.frame(geneid = row.names(DESeq2_int_results),
DESeq2_logFC = DESeq2_int_results$log2FoldChange,
DESeq2_pval = DESeq2_int_results$pvalue),
by = "geneid")
# Create negative log p-values (natural log, not log10)
Voom_edgeR_deseq_int_comp$Voom_neglogp <- -log(Voom_edgeR_deseq_int_comp$Voom_pval)
Voom_edgeR_deseq_int_comp$edgeR_neglogp <- -log(Voom_edgeR_deseq_int_comp$edgeR_pval)
Voom_edgeR_deseq_int_comp$DESeq2_neglogp <- -log(Voom_edgeR_deseq_int_comp$DESeq2_pval)
# Panel function for the lower triangle of ggpairs plots: hex-binned density
# (log of the bin count mapped to a viridis fill) plus a dashed red 1:1
# reference line.
my_fn <- function(data, mapping, ...) {
  hex_layer <- geom_hex(bins = 100,
                        aes(fill = stat(log(count))), alpha = 1)
  one_to_one <- geom_abline(slope = 1, intercept = 0, color = "red",
                            lty = 2, size = 1, alpha = 0.5)
  ggplot(data = data, mapping = mapping) +
    hex_layer +
    scale_fill_viridis_c() +
    one_to_one
}
# Correlation matrix of p-values (columns 8-10 = Voom/edgeR/DESeq2 neg-log p)
pval_pairs <- ggpairs(data = Voom_edgeR_deseq_int_comp,
columns = c(8, 9, 10),
mapping = aes(alpha = 0.001),
lower = list(continuous = my_fn)) +
labs(title = "Correlation matrix: interaction p-values")
pval_pairs +
theme_classic(base_rect_size = 0)
# Correlation matrix of logFC's (columns 2, 4, 6 = Voom/edgeR/DESeq2 logFC)
logFC_pairs <- ggpairs(data = Voom_edgeR_deseq_int_comp,
columns = c( 2, 4, 6 ),
mapping = aes( alpha = 0.001 ),
lower = list(continuous = my_fn)) +
labs(title = "Correlation matrix: interaction logFC's")
logFC_pairs +
theme_classic(base_rect_size = 0)
We will skip edgeR and DESeq2 since they cannot fit random effects.
# Fit multifactorial design matrix
# NOTE(review): (1 | day) is lme4 random-effect syntax; model.matrix() treats
# it as an ordinary expression and produces a single non-estimable column
# (see the 'Coefficients not estimable: 1 | dayTRUE' messages below). This
# chunk presumably demonstrates that limma/voom cannot fit random effects
# this way — confirm that is the intent.
design_rand <- model.matrix( ~1 + pCO2 + ( 1 | day ) ) #Generate multivariate edgeR glm
# Perform Voom transformation
Voom_rand <- voom( y, design_rand, plot = T )
## Coefficients not estimable: 1 | dayTRUE
# Fit using Voom
lm_Voom_fit_rand <- lmFit( Voom_rand, design_rand )
## Coefficients not estimable: 1 | dayTRUE
# Create a contrast across continuous pCO2 variable
cont_rand_day <- contrasts.fit( lm_Voom_fit_rand, coef = "pCO2" )
# Perform empirical Bayes smoothing of standard errors
cont_rand_day <- eBayes( cont_rand_day )
# Output test statistics
rand_results <- topTable( cont_rand_day, coef = "pCO2", adjust.method = "fdr", n = Inf )
# How many DEG are associated with pCO2 after incorporating a random effect for day?
length( which( rand_results$adj.P.Val < 0.05 ) ) # number of DE genes
## [1] 3
## For practical purposes, reduce size of input data in order to run lmer in for loop without using up memory
# Re-filter: keep genes with 3 < cpm < 10 in at least 12 of the 18 samples
keep_red <- rowSums(cpm(y) > 3 & cpm(y) < 10) >= 12
# Apply read filter to tab_exp_df
tab_exp_df_filt <- filter(tab_exp_df, geneid %in%
row.names(filter(as.data.frame(keep_red), keep_red == TRUE)))
# Using dlply, fit linear mixed model to tabularized df of log2-transformed CPM values for each transcript
# Treat time as a grouping factor for the random effects
tab_exp_df_filt$time <- as.factor(tab_exp_df_filt$time)
# NOTE(review): (pCO2 | time) already includes a random intercept, so adding
# (1 | time) specifies overlapping intercept terms — confirm this is the
# intended random-effects structure.
rs_lmes <- dlply(tab_exp_df_filt, c("geneid"), function(df)
lmer(value ~ pCO2 + (1 | time) + (pCO2 | time), data = df))
# Fit null model without random slope
null_lmes <- dlply(tab_exp_df_filt, c("geneid"), function(df)
lmer(value ~ pCO2 + (1 | time), data = df))
# Apply LRT to each gene to test for effect of random slope
# Fix: preallocate the results list and iterate with seq_along() — the
# original used 1:length(), which misbehaves (yields c(1, 0)) on an empty
# list, and grew the list element-by-element.
lmer_lrts <- vector("list", length(rs_lmes))
for (i in seq_along(rs_lmes)) {
  lmer_lrts[[i]] <- lrtest(rs_lmes[[i]], null_lmes[[i]]) # LRT: random slope vs intercept-only per gene
}
## Filter lrt results for transcripts with significantly higher likelihoods than null model
lmer_lrt_dfs <- vector("list", length(lmer_lrts))
# Turn list of LRT outputs into list of dataframes containing output info
# (lmer_lrts[i] — single bracket — is kept as in the original to preserve the
# exact data.frame construction)
for (i in seq_along(lmer_lrts)) {
  lmer_lrt_dfs[[i]] <- data.frame(lmer_lrts[i])
}
# Create singular dataframe with geneids and model outputs for chi-squared and LRT p-value
lmer_lrt_coeff_df <- na.omit(bind_rows(lmer_lrt_dfs, .id = "column_label")) # na.omit removes first row of each df, which lacks these data
# Add geneid based on element number from original list of LRT outputs
lmer_lrt_coeff_df <- merge(lmer_lrt_coeff_df,
data.frame(geneid = names(rs_lmes),
column_label = as.character(seq(length(rs_lmes)))),
by = "column_label")
# Apply FDR adjustment to LRT p-values before filtering for sig non-linear effects
lmer_lrt_coeff_df$FDR <- p.adjust(lmer_lrt_coeff_df$Pr..Chisq., method = "fdr")
# Filter LRT results for sig FDR coeff...
lmer_lrt_filt <- filter(lmer_lrt_coeff_df, FDR < 0.05)
# How many genes showed significant LRT?... none!
count(lmer_lrt_filt)
## [1] column_label X.Df LogLik Df Chisq
## [6] Pr..Chisq. geneid FDR freq
## <0 rows> (or 0-length row.names)
Bogan, S. N., Johnson, K. M., & Hofmann, G. E. (2020). Changes in genome-wide methylation and gene expression in response to future pCO2 extremes in the Antarctic pteropod Limacina helicina antarctica. Frontiers in Marine Science, NA.
Hawinkel, S., Rayner, J. C. W., Bijnens, L., & Thas, O. (2020). Sequence count data are poorly fit by the negative binomial distribution. PLoS One, 15(4), e0224909.
Hoffman, G. E., & Roussos, P. (2021). Dream: Powerful differential expression analysis for repeated measures designs. Bioinformatics, 37(2), 192–201.
Johnson, K. M., & Hofmann, G. E. (2016). A transcriptome resource for the Antarctic pteropod Limacina helicina antarctica. Mar. Genomics, 28, 25–28.
Law, C. W., Chen, Y., Shi, W., & Smyth, G. K. (2014). Voom: Precision weights unlock linear model analysis tools for RNA-seq read counts. Genome Biol., 15(2), R29.
Li, B., & Dewey, C. N. (2011). RSEM: Accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics, 12, 323.
Love, M. I., Huber, W., & Anders, S. (2014). Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol., 15(12), 550.
Rivera, H. E., Aichelman, H. E., Fifer, J. E., Kriefall, N. G., Wuitchik, D. M., Wuitchik, S. J. S., & Davies, S. W. (2021). A framework for understanding gene expression plasticity and its influence on stress tolerance. Mol. Ecol., 30(6), 1381–1397.
Robinson, M. D., McCarthy, D. J., & Smyth, G. K. (2010). EdgeR: A bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics, 26(1), 139–140.
Schloerke, B., Crowley, J., Cook, D., Briatte, F., Marbach, M., & others. (2018). Ggally: Extension to ggplot2. R Package Version.